Introduction
# Welcome to the 'steroid data analysis' webpage!
# The procedures and explanations to make all the analysis and plots are in their individual chapters below.
# These methods could be also easily applied to other types of data sets and metabolites than 'steroids' and their respective metadata per se.
# In addition, there is a small 'disclaimer' also at the end of this webpage to emphasize that this site is mainly for educational purposes.
# Please let me know if you have any questions. For that, use the 'following' email: patati at the university of Turku
Loading Required R Packages
# echo=FALSE is too good
# Set library paths if needed
# .libPaths(c("C:/Program Files/R/R-4.4.1/library", .libPaths()))
# List of libraries to load (alphabetically sorted)
packages <- c("bigsnpr", "binilib", "brickster", "car", "censReg", "circlize", "ComplexHeatmap", "correlation",
"corrplot", "daiR", "datarium", "dmetar", "dplyr", "effsize", "extrafont", "forcats", "fs", "FSA",
"ggcorrplot", "ggforce", "ggforestplot", "ggplot2", "ggplotify", "ggpubr", "ggsankey", "ggsankeyfier",
"ggh4x", "ggtext", "glmnet", "grid", "Hmisc", "hrbrthemes", "igraph", "insight", "lavaan", "lmtest",
"lme4", "lsr", "magick", "magrittr", "Maaslin2", "mdatools", "mediation", "meta", "mgcv", "mlma",
"MOFA2", "pheatmap", "PerformanceAnalytics", "pathviewr", "plyr", "plotrix", "ppcor", "prettydoc",
"psych", "quantreg", "qpgraph", "ragg", "RColorBrewer", "rcompanion", "readxl", "remotes", "reshape2",
"rgl", "rmarkdown", "rmdformats", "rstatix", "scales", "scater", "scatterplot3d", "sjPlot", "stringr",
"superb", "tibble", "tidyverse", "tint", "tufte", "viridis", "xlsx")
# Load all libraries
invisible(lapply(packages, library, character.only = TRUE))
# Note: Do not load 'forestplot' as it conflicts with 'ggforestplot'
# Install packages if not already installed
# renv::install() # Installs from the basic R repository
# if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install(c("ComplexHeatmap", "DESeq2", "dmetar", "fgsea", "ggforestplot", "ggsankey", "limma", "Maaslin2", "metagenomeSeq", "MOFA2", "qpgraph", "scater", "scRNAseq", "sevenbridges"))
# remotes::install_github(c("davidsjoberg/ggsankey", "fossbert/binilib", "MathiasHarrer/dmetar", "mattflor/chorddiag", "NightingaleHealth/ggforestplot"))
# devtools::install_github("mattflor/chorddiag") # Alternative installation method
Importing Data and Metadata
#First set your data folder:
setwd("C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis")# or:"C:/Users/patati/Desktop/Turku/R" #check the wd with: here::here() #or getwd()
load("thereal.RData") #This is so to say real data, and it is not available here (at the site).
Setting Global Variables
options(scipen = 999) # Disable scientific notation
# rm(list = ls()) # Clear workspace; this should not be if you have the load above
thedate <- strftime(Sys.Date(), "%d%m%y") #Do not take the old date from the load...
date <- paste0('tikka', thedate) # Customize this as needed
# Example installation commands
# remotes::install_github("fossbert/binilib", force=TRUE)
# install.packages(c('tidyverse', 'tibble'))
# if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# BiocManager::install("Maaslin2")
# devtools::install_github("davidsjoberg/ggforestplot")
# remotes::install_version("insight", version = "0.20.5", repos = "http://cran.us.r-project.org", force=TRUE)
# font_import() # Import fonts if not already done
# loadfonts(device = "win") # Load fonts for Windows
# renv::status() # Check renv status
# library(rmarkdown); render("path/to/file.Rmd") # Render R Markdown document
# remove.packages("DelayedArray")
# BiocManager::install("DelayedArray")
# install.packages("Require") # Install 'Require' package
Making Boxplots
#https://r-graph-gallery.com/265-grouped-boxplot-with-ggplot2.html
#https://stackoverflow.com/questions/53724834/why-does-the-plot-size-differ-between-docx-and-html-in-rmarkdownrender
boxplots <- function(tvt, Group, Outcome, Out, oute, other) {
# Filter data based on gender
tvt <- tvt %>%
filter(if (Group == 'Male') Gender == 1 else if (Group == 'Female') Gender == 0 else TRUE)
# Prepare data for plotting
Steroid <- rep(colnames(tvt[, 9:28]), each = nrow(tvt))
data2 <- rep('Control', nrow(tvt))
num <- ifelse(Outcome == 'HOMA-IR', 1.5, min(tvt[[Outcome]]))
data2[tvt[[Outcome]] > num] <- 'Case'
Treatment <- data2
Concentration <- as.vector(unlist(tvt[, 9:28]))
data <- data.frame(Steroid, Treatment, Concentration)
data$Group <- 0
# Correct steroid names if the level exists
if ("17aOH-P4" %in% levels(data$Steroid)) {
data <- data %>%
mutate(Steroid = fct_recode(Steroid, '17a-OHP4' = '17aOH-P4'))
}
# Assign groups
rownames(groups2) <- 1:20
for (i in seq_len(nrow(groups2))) {
data[data$Steroid %in% groups2$Abbreviation[i], 'Group'] <- groups2$Group[i]
}
# Set plot title
title <- paste(Out, "'s Effect on Concentrations of Steroids in ", Group, sep = "")
# Define legend labels
e1 <- ifelse(num == 1.5, paste('Case (>', num, ')', sep = ""), paste('Case (>', 0, ')', sep = ""))
e2 <- ifelse(num == 1.5, paste('Control (<=', num, ')', sep = ""), paste('Control (=', 0, ')', sep = ""))
# Remove rows with NA concentrations
data <- data %>% filter(!is.na(Concentration))
# Create boxplot
p <- ggplot(data, aes(x = Steroid, y = Concentration, fill = Treatment)) +
geom_boxplot(notch = FALSE, notchwidth = 0.5, outlier.shape = 1, outlier.size = 2, coef = 1.5) +
theme_classic2() +
theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2, size = 10.5),
axis.text = element_text(color = "black"),
panel.grid.minor = element_blank(),
text = element_text(size = 10.5, family = "Calibri"),
axis.title = element_text(size = 14),
plot.title = element_text(size = 14),
legend.text = element_text(size = 14),
legend.title = element_text(size = 14)) +
labs(x = "Steroids", y = "Log2 of Picomolar Concentrations", title = title) +
scale_fill_manual(values = c("orange", "blue"), name = oute, labels = c(e1, e2)) +
facet_grid(~Group, scales = "free_x", space = "free") +
stat_compare_means(hide.ns = TRUE, label = "p.signif", method = "wilcox.test",
symnum.args = list(cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("****", "***", "**", "*", "ns")),
size = 8, paired = FALSE, label.y = 15.5)
# Save plot as PNG
path <- "C:/Users/patati/Documents/GitHub/Steroid_Data_Analysis/"
pngfile <- fs::path(path, paste0(Group, Out, 'boxe', ".png"))
knitr::include_graphics(pngfile)
}
# Example usage
tv_half_log22[, '11-KA4'][tv_half_log22[, '11-KA4'] == min(tv_half_log22[, '11-KA4'])] <- median(tv_half_log22[, '11-KA4'])
other <- '261124'
ie <- tv_half_log22
hupo <- match(colnames(ie)[colnames(ie) %in% groups2[, 3]], groups2[, 3])
colnames(ie)[colnames(ie) %in% groups2[, 3]] <- groups2[hupo, 2]
windowsFonts(A = windowsFont("Calibri (Body)"))
# The significance levels are: '****<0.001', '***<0.01', '**<0.05', '*<0.1'
Outcome='Steatosis Grade';Out='Steatosis'; oute='Steatosis Grade';num=0;Group='All';boxplots(ie,Group,Outcome,Out,oute,other);Group='Female';boxplots(ie,Group,Outcome,Out,oute,other);Group='Male';boxplots(ie,Group,Outcome,Out,oute,other)
Outcome='Fibrosis Stage';Out='Fibrosis'; oute='Fibrosis Stage';num=0;Group='All';boxplots(ie,Group,Outcome,Out,oute,other);Group='Female';boxplots(ie,Group,Outcome,Out,oute,other);Group='Male';boxplots(ie,Group,Outcome,Out,oute,other)
# https://www.elsevier.es/en-revista-annals-hepatology-16-articulo-assessment-hepatic-fibrosis-necroinflammation-among-S1665268119314590 #So it is in grade
Outcome='Necroinflammation';Out='Necroinflammation'; oute='Necroinflammation Grade';num=0;Group='All';boxplots(ie,Group,Outcome,Out,oute,other);Group='Female';boxplots(ie,Group,Outcome,Out,oute,other);Group='Male';boxplots(ie,Group,Outcome,Out,oute,other)
Outcome='HOMA-IR';Out='HOMA-IR'; oute='HOMA-IR';num=1.5 ;Group='All';boxplots(ie,Group,Outcome,Out,oute,other);Group='Female';boxplots(ie,Group,Outcome,Out,oute,other);Group='Male';boxplots(ie,Group,Outcome,Out,oute,other)
Making Forest Plots
# Define the NAFLD dataset by selecting the first 28 columns from tv
NAFLD <- tv[, 1:28]
# Convert specific columns to binary values using vectorized operations
cols_to_binary <- c(5, 6, 7)
NAFLD[, cols_to_binary] <- (NAFLD[, cols_to_binary] > 0) * 1
# Convert column 8 to binary based on the threshold of 1.5
NAFLD[, 8] <- (NAFLD[, 8] > 1.5) * 1
# Clean column names to remove special characters and make them consistent
patterns <- c("-", "/", "11", "17", "#")
replacements <- c(".", ".", "X11", "X17", ".") #?
# Ensure patterns and replacements are correctly paired
if (length(patterns) == length(replacements)) {
for (i in seq_along(patterns)) {
colnames(NAFLD) <- gsub(patterns[i], replacements[i], colnames(NAFLD))}} else {
stop("Patterns and replacements vectors must be of the same length.")}
# This works with the autoscaled (raw if loge=1 and remove 1 in the means) data NAFLD as well...
pre_errors_2=function(NAFLD,Outcome,Group,name,ordera,oute,first,e,xlim) { # Group='Female'
# Filter data based on the 'Group' variable
NAFLDo <- switch(Group,
"Male" = NAFLD[NAFLD[,'SEX.1F.2M'] == 2, ],
"Female" = NAFLD[NAFLD[,'SEX.1F.2M'] == 1, ],
"All" = NAFLD)
# Initialize vectors to store sample data and counts
sample_data <- list()
n0 <- n1 <- 0
# Loop through the two groups (Outcome == 0 and Outcome > 0)
for (i in 1:2) {
SG0 <- if (i == 1) {
NAFLDo[NAFLDo[, Outcome] == 0, ]
} else {
NAFLDo[NAFLDo[, Outcome] > 0, ]}
# Store the count of samples in each group
if (i == 1) {
n0 <- nrow(SG0)
} else {
n1 <- nrow(SG0)}
# Calculate medians and standard deviations for columns 9 to 28
means <- apply(SG0[, 9:28], 2, median, na.rm = TRUE)
sds <- apply(SG0[, 9:28], 2, sd, na.rm = TRUE)
# Calculate error margins
error_lower <- means - sds
error_upper <- means + sds
error <- sds
# Append results to sample_data
sample_data[[i]] <- data.frame(study = colnames(NAFLD[, 9:28]),
index = colnames(NAFLD[, 9:28]),
result = means,
error = error)}
df=data.frame(sample_data) #
# Calculate p-values using Wilcoxon test for columns 9 to 28
ps <- sapply(9:28, function(j) {
xnam <- colnames(NAFLDo)[j]
fmla <- as.formula(paste(xnam, "~", Outcome))
wilcox.test(fmla, data = NAFLDo, exact = FALSE)$p.value})
#https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
# Calculate the ratio of results and log-transform
a <- df[df[, 1] == e, 'result.1'] / df[df[, 1] == e, 'result']
v2 <- data.frame(log(df$result.1 / df$result))
# Rename columns and clean up variable names
v2$result <- v2[, 1]
v2$name <- df$study
v2 <- v2[, 2:3]
v2$name <- gsub("\\.", "-", v2$name)
v2$name <- gsub("X11", "11", v2$name)
v2$name <- gsub("X17", "17", v2$name)
v2$name[v2$name == "T-Epi-T"] <- "T/Epi-T"
v2$pval <- ps
# Calculate result_pure and error
v2$result_pure <- df$result.1 / df$result
v2$error <- (abs((1 / df$result) * df$error.1) + abs((df$result.1 / df$result^2) * df$error)) / nrow(NAFLDo) * 1.64
# Adjust error values
v2$error <- ifelse(v2$error > (median(v2$error) + sd(v2$error)), median(v2$error) * 1.25, v2$error)
# Calculate error bounds and log-transformed values
v2$errord1a <- v2$result_pure - v2$error
v2$errord2a <- v2$result_pure + v2$error
v2$errord1 <- log(v2$errord1a)
v2$errord2 <- log(v2$errord2a)
v2$result <- log(v2$result_pure)
v2$Control <- df$result
v2$Case <- df$result.1
# Add p-values and significance
v2$pval0 <- v2$pval
v2$pval1 <- v2$pval
v2$Significance0 <- ifelse(v2$pval0 < 0.1, 'Yes', 'No')
v2$Color0 <- ifelse(v2$pval0 < 0.1, 'blue', 'grey')
v2$Significance1 <- ifelse(v2$pval1 < 0.1, 'Yes', 'No')
v2$Color1 <- ifelse(v2$pval1 < 0.1, 'blue', 'grey')
# Merge with group data and sort
gn <- groups[groups$Abbreviation != 'F', c('Group', 'Abbreviation')]
gn <- gn[order(gn$Abbreviation), ]
v2 <- v2[order(v2$name), ]
v2 <- cbind(v2, gn[order(gn$Abbreviation), ])
v2 <- v2[order(-v2$result), ]
xlab = "Autoscaled Concentrations (SE)" #xlab = "Raw Concentrations in Log10 Scale (SE)"}
xlim=c(min(v2$errord1),max(v2$errord2))
#Occasionally: xlim=c(min(v2$result)*1.1,max(v2$result)*1.1) # if (xlim[2]>1) {xlim[2]=1};# if (xlim[1] < -0.75) {xlim[1]=-0.75};
# Create forest plot
plote2 <- forestplot(df = v2,
estimate = result,
se = 0,
pvalue = pval1,
psignif = 0.1,
xlim = xlim,
xlab = 'Logged Ratio between Raw Concentrations of Case and Control with 90% CI',
ylab = 'Steroid Groups',
title = '',
colour = Significance1) +
ggforce::facet_col(facets = ~Group, scales = "free_y", space = "free", strip.position = 'left') +
geom_errorbarh(aes(xmin = errord1, xmax = errord2, height = .0, colour = Significance1))
# Set color palette
hp <- if (sum(v2$Significance1 == 'Yes') == 20) c('blue', 'blue') else c('#999999', 'blue')
# Order factor levels based on Group and first
if (Group=='All' & first==TRUE) {ordera=rev(groups$Abbreviation)#v2$name[order(v2$result)]; #
plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)} else if
(Group=='All' & first==FALSE) {plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)} else if
(Group=='Female') {plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)} else if
(Group=='Male') {plote2[["data"]][["name"]]=factor(plote2[["data"]][["name"]], levels = ordera)}
#https://www.r-bloggers.com/2020/03/how-to-standardize-group-colors-in-data-visualizations-in-r/
plote2$layers[[1]]$aes_params$odd <- "#00000000" #https://stackoverflow.com/questions/71745719/how-to-control-stripe-transparency-using-ggforestplot-geom-stripes
v2$Group2=v2$Group
v2 <- transform(v2,Group2 = as.numeric(as.factor(Group2)))
v2$facet_fill_color <- c("red", "green", "blue", "yellow", "brown")[v2$Group2]
# Create plot with custom themes
jopon <- plote2 + theme(axis.text.y = element_blank()) + theme_classic2()
jopon2 <- jopon + geom_point(aes(colour = factor(Significance1)), colour = v2$Color1) +
scale_color_manual(values = hp) + theme(legend.position = "none") + theme(strip.text.y = element_text(size = -Inf))
# Customize facet strip colors
g <- ggplot_gtable(ggplot_build(jopon2))
stripr <- which(grepl('strip-l', g$layout$name))
fills <- c("red", "green", "blue", "yellow", "brown")
for (i in seq_along(stripr)) {
j <- which(grepl('rect', g$grobs[[stripr[i]]]$grobs[[1]]$childrenOrder))
g$grobs[[stripr[i]]]$grobs[[1]]$children[[j]]$gp$fill <- fills[i]}
# grid::grid.draw(g)
# Save plot as JPEG and convert to PDF and SVG
jpeg(paste(name, "divi.jpg"), width = 7500, height = 11000, quality = 100, pointsize = 16, res = 1000)
print(grid::grid.draw(g))
dev.off()
daiR::image_to_pdf(paste(name, "divi.jpg"), pdf_name = paste0(paste(name, "divi.jpg"), '.pdf'))
my_image <- image_read(paste(name, "divi.jpg"))
my_svg <- image_convert(my_image, format = "svg")
image_write(my_svg, paste(name, "divi.svg"))
return(ordera) #If you do not want to have 'null' to the Rmarkdown/html take this away
}
# This is with first(!!). Use it.
Outcome='Steatosis.Grade.0.To.3';Out='Steatosis'; oute='Steatosis';first=TRUE; e='P4';ordera=c();
Group='All';name1=paste("Forest plot of",Group, "Steroid Ratios in",Out);
hel=pre_errors_2(NAFLD,Outcome,Group,name1,ordera,oute,first,e,xlim)
# #Afterwards:
first=FALSE;
Group='Female';name2=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name2,ordera=hel,oute,first,e,xlim)
Group='Male'; name3=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name3,ordera=hel,oute,first,e,xlim)
#
Outcome='Fibrosis.Stage.0.to.4'; Out='Fibrosis';oute='Fibrosis';
Group='All'; name4=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name4,ordera=hel,oute,first,e,xlim)
Group='Female';name5=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name5,ordera=hel,oute,first,e,xlim)
Group='Male'; name6=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name6,ordera=hel,oute,first,e,xlim)
#
Outcome='Necroinflammation'; Out='Necroinflammation';oute='Necroinflammation';
Group='All'; name7=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name7,ordera=hel,oute,first,e,xlim) #not the very first though...
Group='Female';name8=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name8,ordera=hel,oute,first,e,xlim)
Group='Male'; name9=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name9,ordera=hel,oute,first,e,xlim)
#
Outcome='HOMA.IR';Out='HOMA-IR';oute='HOMAIR';
Group='All';name10=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name10,ordera=hel,oute,first,e,xlim) #not the very first though...
Group='Female';name11=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name11,ordera=hel,oute,first,e,xlim)
Group='Male'; name12=paste("Forest plot of",Group, "Steroid Ratios in",Out);
pre_errors_2(NAFLD,Outcome,Group,name12,ordera=hel,oute,first,e,xlim)
# Fyi: I was able to revise some of the above codes with Copilot...
Forest plot of All Steroid Ratios in Steatosis
Forest plot of Female Steroid Ratios in Steatosis
Forest plot of Male Steroid Ratios in Steatosis